The following is not meant to be a source of truth in terms of data integrity, but rather to illustrate different ways of displaying, engineering, and/or presenting the data.
# load packages, install if needed
packages = c(
"dplyr"
, "ggplot2"
, "formattable"
, "plotly"
, "RColorBrewer"
, "scales"
, "stringr"
, "tidyr"
, "ElmeR"
, "RJDBC"
, "kableExtra"
, "wesanderson"
, "reshape2"
, "rtweet"
, "tidytext"
, "lubridate"
, "wordcloud"
, "ggpubr"
, "ggthemes"
, "knitrBootstrap"
, "DT"
, "MatchIt"
, "beyonce"
, "UpSetR"
, "gganimate"
, "wordcloud2"
, "widyr"
, "ggraph"
, "igraph"
, "aod"
, "corrplot"
, "ROCR"
, "InformationValue"
, "car"
, "glmnet"
, "caret"
, "kernlab"
, "pdp"
, "rpart.plot"
, "rpart"
, "e1071"
)
package.check <- lapply(packages, FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
})
#import the data
library(readr)
us_county_stats <- read_csv("datasets_579969_1220276_us_county.csv")
brutality_cases <- read_csv("police_brutality.csv")
# glimpse the data
# us_county_stats %>%
# head()
#
# brutality_cases %>%
# head()
#missing values
colMeans(is.na(brutality_cases))
Timestamp Date of the Police Brutality
1.0000000000 0.0000000000
Approximate Time of the Police Brutality State where the Police Brutality Occurred
1.0000000000 0.0000000000
City where the Police Brutality Occurred Video URL of the Police Brutality
0.0007829832 0.4519117839
Names of Victims Involved Names of Police Officers Involved
0.0000000000 1.0000000000
Badge Numbers of Police Officers involved Enter any extra information you want to share here.
1.0000000000 1.0000000000
Category test
1.0000000000 1.0000000000
County Category_1
0.0000000000 1.0000000000
summary(us_county_stats)
fips county state state_code male female median_age
Min. : 1001 Length:3220 Length:3220 Length:3220 Min. : 38 Min. : 37 Min. :21.70
1st Qu.:19032 Class :character Class :character Class :character 1st Qu.: 5658 1st Qu.: 5573 1st Qu.:38.10
Median :30024 Mode :character Mode :character Mode :character Median : 12916 Median : 12996 Median :41.20
Mean :31394 Mean : 49875 Mean : 51457 Mean :41.28
3rd Qu.:46106 3rd Qu.: 33248 3rd Qu.: 33531 3rd Qu.:44.30
Max. :72153 Max. :4976788 Max. :5121264 Max. :67.00
population female_percentage lat long
Min. : 75 Min. :21.00 Min. :17.98 Min. :-164.03
1st Qu.: 11214 1st Qu.:49.43 1st Qu.:34.35 1st Qu.: -98.09
Median : 25950 Median :50.42 Median :38.21 Median : -89.95
Mean : 101332 Mean :49.96 Mean :37.97 Mean : -91.64
3rd Qu.: 66552 3rd Qu.:51.15 3rd Qu.:41.69 3rd Qu.: -82.99
Max. :10098052 Max. :58.61 Max. :69.31 Max. : -65.29
# Clean dataset (drop nulls)
brutality_clean = brutality_cases
# Remove columns with more than 50% NA
brutality_clean = brutality_clean[, which(colMeans(!is.na(brutality_clean)) > 0.5)]
# Change columns to snake case
snake_case <- function(x) {
colnames(x) <- gsub(" ", "_", colnames(x));x
colnames(x) <-tolower(colnames(x))
}
colnames(brutality_clean) = snake_case(brutality_clean)
brutality_clean = brutality_clean %>%
rename(
"state" = state_where_the_police_brutality_occurred
, "city" = city_where_the_police_brutality_occurred
, "date" = date_of_the_police_brutality
) %>%
mutate(
state_county = paste0(state, "-", county)
)
brutality_clean$date = as.Date(brutality_clean$date, "%m/%d/%Y")
us_stats_clean = us_county_stats
us_stats_clean$county = str_replace(us_stats_clean$county, " County", "")
us_stats_clean = us_stats_clean %>%
group_by(state) %>%
mutate(state_pop = sum(population)) %>%
rename(
"state_full" = state
, "state" = state_code
) %>%
ungroup()
# us_stats_clean =
us_stats_clean = us_stats_clean %>%
mutate(state_county = paste0(state, "-", county)) %>%
select(state_pop, state) %>%
unique()
df_merged = brutality_clean %>%
left_join(us_stats_clean) %>%
mutate(state_pop = ifelse(state == "DC", 705749, state_pop)
) %>%
unique()
df_merged = df_merged %>%
group_by(
state
) %>%
add_tally() %>%
ungroup() %>%
rename(
"state_totals" = n
) %>%
mutate(
state_pop_millions = state_pop/1000000
, totals_per_million = state_totals/state_pop_millions
)
df_plot = df_merged %>%
select(
state, state_pop, state_pop_millions, state_totals, totals_per_million
) %>%
unique() %>%
arrange(desc(totals_per_million)) %>%
mutate(
state = reorder(as.factor(state), totals_per_million)
)
a = df_plot %>%
plot_ly(
y = ~state
, x = ~totals_per_million
, type = "bar"
, color = paired_better[1]
, orientation = "h"
, width = 900
, height = 1000
, hoverinfo = "text"
, text = ~paste(
"State: ", state
, "<br>State Population: ", comma(state_pop)
, "<br>Reported Incidents: ", state_totals
, "<br>Incidents per Million Residents: ", comma(totals_per_million)
)
) %>%
layout(
xaxis = list(title = "Reported Incidents per Million State Residents")
, yaxis = list(title = "")
)
b = df_plot %>%
plot_ly(
y = ~state
, x = ~state_totals
, type = "bar"
, color = paired_better[10]
, orientation = "h"
, width = 900
, height = 1000
, hoverinfo = "text"
, showlegend = FALSE
, text = ~paste(
"State: ", state
, "<br>State Population: ", comma(state_pop)
, "<br>Reported Incidents: ", state_totals
, "<br>Incidents per Million Residents: ", comma(totals_per_million)
)
) %>%
layout(
xaxis = list(title = "Total Reported Incidents")
, yaxis = list(title = "")
)
subplot(a, b, titleX = TRUE)
There are some counties with null values when it comes to population. We can obviously find these via a better dataset, but for now I’m going to exclude them in order to provide examples with the data that we have. Obviously, this is not accurate reporting.
df_filtered = df_merged
df_filtered = df_filtered %>%
left_join(us_stats_clean) %>%
filter(!is.na(population))
df_filtered = df_filtered %>%
group_by(state, county) %>%
add_tally() %>%
rename("county_totals" = n) %>%
arrange(desc(county_totals)) %>%
mutate("incidents_per_capita" = county_totals/(population)) %>%
ungroup()
df_county = df_filtered %>%
select(state, state_county, county, state_full, state_totals, median_age, population, county_totals, incidents_per_capita) %>%
unique() %>%
arrange(desc(incidents_per_capita))
df_county %>%
mutate(
incidents_per_capita = comma(incidents_per_capita, .000001)
, population = comma(population)
) %>%
datatable()